import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import time
import folium
from folium import plugins
# Load the CSV and discard every row that contains a missing value.
apData = pd.read_csv('./Data/yelpData0-4927.csv').dropna()

# Quick inspection of the cleaned frame. The bare expressions only show
# output when run as notebook cells; info() prints in any context.
apData
apData.info()
apData.head(5)
apData.isnull().values.any()
apData.describe()
# Draw one wide, flat box plot per column to eyeball its spread and outliers.
for box_column in ('Price', 'NearbyPOIs', 'Size'):
    plt.figure(figsize=(20, 2))
    # Plain tick labels instead of scientific notation on the value axis.
    plt.ticklabel_format(style='plain')
    sns.boxplot(x=apData[box_column], color="green")
# Outlier filter settings: a row is kept only when the chosen column lies
# strictly above minNum and strictly below the `tolerance` quantile.
tolerance = 0.8
minNum = 0
column = 'Price'

# Upper cut-off: the value at the `tolerance` quantile of the column
# (0.8 here, i.e. the 80th percentile).
quant = apData[column].quantile(tolerance)
print(quant)

# Apply the upper and lower bound together in a single boolean mask.
apData = apData[(apData[column] < quant) & (apData[column] > minNum)]
print(apData.shape)

# Re-plot the price distribution after filtering.
plt.figure(figsize=(20, 2))
plt.ticklabel_format(style='plain')
sns.boxplot(x=apData['Price'], color="green")

# Persist the filtered data; the model works from this file.
apData.to_csv('./Data/yelpDataGather.csv', index=False)
apData.info()
# Reload the filtered data from disk so the derived columns are computed on
# exactly what was saved (with a fresh 0..n-1 index).
apData = pd.read_csv('./Data/yelpDataGather.csv')

# Row bounds kept as module-level names in case later code reads them.
# toRow used to be hard-coded to 3913, which raised KeyError when the file
# had fewer rows and left extra rows at 0.0 when it had more; it is now
# derived from the data actually loaded.
fromRow = 0
toRow = len(apData)

# Add two derived features for the ML model, computed column-wise over every
# row instead of a per-row .at loop:
#   PricePerKvm  - Price / Size, rounded to a whole number
#                  (presumably price per square metre; confirm the unit of Size)
#   PricePerRoom - Price / Rooms, rounded to a whole number
apData['PricePerKvm'] = (apData['Price'] / apData['Size']).round(0).astype('float')
apData['PricePerRoom'] = (apData['Price'] / apData['Rooms']).round(0).astype('float')
apData.head(5)

# Overwrite the saved file so it now includes the two new columns.
apData.to_csv('./Data/yelpDataGather.csv', index=False)
# Build an interactive map to sanity-check the gathered coordinates.
m = folium.Map([59.3508, 18.0973], zoom_start=11)

# Draw one circle per data row; clicking it shows that row's PricePerKvm.
for _, listing in apData.iterrows():
    position = [listing['Latitude'], listing['Longitude']]
    marker = folium.CircleMarker(
        position,
        radius=15,
        popup=listing['PricePerKvm'],
        fill_color="#3db7e4",  # light-blue fill
    )
    marker.add_to(m)

# Bare expression: renders the map when run as a notebook cell.
m